Flora Prepper Model Evaluation

Initialize the environment

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import numpy as np
import pandas as pd

# Put the parent directory on sys.path so the custom modelling code under `src` can be imported
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.models.run_model import *
from src.visualization.visualize import *
import src.features as features
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

Import and visualize the data

Visualize counts of training datasets

In [2]:
# Load the processed training data; the first CSV column is used as the row index.
flora_data_frame = pd.read_csv("../data/processed/flora_data_frame.csv", index_col=0)

# Number of records contributed by each source dataset.
flora_data_frame['dataset_name'].value_counts().plot.bar(title="Records per dataset")
plt.show()

# Number of non-null text records for every (classification, dataset) pair.
flora_data_frame[['classification', 'dataset_name', 'text']] \
    .groupby(['classification', 'dataset_name']).count() \
    .plot.bar(title="Text records per classification and dataset")
plt.show()  # render explicitly so the cell does not echo the Axes repr as its output
Out[2]:
<matplotlib.axes._subplots.AxesSubplot at 0x1055b7f10>

View a word cloud for all text data

Text is processed using the same custom (bare-bones) tokenizer and stopwords used to train the model.

In [3]:
# Project-specific words to exclude, passed to the stop-word builder as custom_stop_words.
# "morphology" is an artefact of the FNA xml key statements.
# TODO Return to this and fix
# TODO Find a way to keep numbers and ellipses!
custom_stop_words = ["unknown", "accepted", "synonym", "basionym", "source",
                     "note", "notes", "morphology"]
tokenized_stop_words = features.prepare_stop_words(custom_stop_words=custom_stop_words)

# Every text record joined into one space-separated string for the all-data word cloud.
text = " ".join(flora_data_frame.text)
visualize_words(text, tokenized_stop_words)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[(',', 20702), (';', 11674), ('.', 8384), ('mm', 4076), (')', 3248), ('(', 3247), ('cm', 1716), ('long', 1613), ('2', 1395), ('usually', 1344), ('1', 1325), ('leaves', 1073), ('3', 995), ('blades', 918), ('glabrous', 820), ('margins', 778), ('wide', 760), ('Leaves', 739), ('leaf', 733), ('5', 714), ('4', 702), ('Plants', 603), ('×', 590), ('sometimes', 586), ('species', 576), ('genus', 575), ('base', 530), ('green', 527), (':', 520), ('erect', 508)]

What does the word cloud look like if we apply a strict preprocessing cleaning regime?

In [4]:
# Same custom words as the bare-bones stop-word list, plus "fna_id"; clean=True asks the
# builder for the variant that matches the strict cleaning regime.
strict_custom_stop_words = ["unknown", "accepted", "synonym", "basionym", "source",
                            "note", "notes", "morphology", "fna_id"]
tokenized_stop_words_clean = features.prepare_stop_words(
    custom_stop_words=strict_custom_stop_words,
    clean=True,
)
visualize_words(text, tokenized_stop_words_clean, cleanup=True)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[('leav', 1814), ('long', 1621), ('blade', 1432), ('usual', 1345), ('leaf', 1211), ('stem', 1141), ('flower', 1075), ('plant', 1049), ('margin', 894), ('glabrou', 820), ('wide', 815), ('fruit', 811), ('base', 722), ('branch', 712), ('infloresc', 668), ('speci', 610), ('basal', 592), ('sometim', 588), ('genu', 577), ('petal', 560), ('green', 551), ('rare', 549), ('distal', 545), ('erect', 517), ('sepal', 489), ('brown', 488), ('often', 475), ('white', 459), ('den', 458), ('spikelet', 458)]

Generate word clouds by classification.

Are there any noticeable differences among the classifications in their most frequently used words?

In [5]:
def _join_classification_text(label):
    """Return every text record whose classification equals `label`, joined with single spaces."""
    matching_text = flora_data_frame.loc[flora_data_frame.classification == label, "text"]
    return " ".join(matching_text)


# One concatenated string per classification, used for the per-class word clouds.
taxon_identification = _join_classification_text("taxon_identification")
morphology = _join_classification_text("morphology")
key = _join_classification_text("key")
habitat = _join_classification_text("habitat")

Taxon identification

In [6]:
# Taxon identification text under both regimes: bare-bones stop words first,
# then the strict cleaning regime (cleanup=True).
for stop_words, regime_options in (
    (tokenized_stop_words, {}),
    (tokenized_stop_words_clean, {"cleanup": True}),
):
    visualize_words(taxon_identification, stop_words, color="blue", **regime_options)
[(';', 754), ('.', 651), ('genus', 567), ('species', 530), ('(', 336), (')', 335), ('family', 279), (',', 139), ('variety', 134), (':', 117), ('fna_id', 96), ('L.', 88), ('tribe', 84), ('subfamily', 70), ('&', 63), ('Linnaeus', 53), ('A.', 51), ('Carex', 44), ('R.', 41), ('asteraceae', 38), ('P.', 36), ('Jussieu', 35), ('ex', 34), ('1754', 34), ('section', 34), ('Gray', 33), ('J.', 32), ('de', 32), ('subspecies', 32), ('sedge', 30)]
[('genu', 567), ('speci', 530), ('famili', 279), ('varieti', 134), ('tribe', 84), ('subfamili', 70), ('linnaeu', 53), ('asteracea', 50), ('carex', 44), ('sedg', 42), ('jussieu', 35), ('grass', 35), ('section', 34), ('gray', 33), ('subspeci', 32), ('candol', 30), ('specialstatu', 27), ('poacea', 23), ('subgenu', 22), ('green', 21), ('martinov', 19), ('rosacea', 19), ('cassini', 17), ('plate', 17), ('aster', 16), ('brassicacea', 16), ('juncu', 16), ('rush', 16), ('hooker', 16), ('schimper', 16)]

Morphology

In [7]:
# Morphology text under both regimes: bare-bones stop words first,
# then the strict cleaning regime (cleanup=True).
for stop_words, regime_options in (
    (tokenized_stop_words, {}),
    (tokenized_stop_words_clean, {"cleanup": True}),
):
    visualize_words(morphology, stop_words, color="red", **regime_options)
[(',', 9581), (';', 3186), ('.', 3120), ('mm', 2062), ('(', 1162), (')', 1162), ('cm', 961), ('long', 861), ('×', 432), ('glabrous', 374), ('usually', 348), ('margins', 320), ('leaves', 298), ('wide', 288), ('erect', 287), ('brown', 265), ('base', 252), ('Leaves', 251), (':', 249), ('apex', 234), ('sometimes', 227), ('Plants', 227), ('blades', 225), ('green', 221), ('pubescent', 198), ('Flowers', 192), ('often', 192), ('blade', 186), ('=', 174), ('branches', 170)]
[('long', 862), ('leav', 550), ('flower', 447), ('blade', 415), ('stem', 393), ('glabrou', 374), ('margin', 350), ('usual', 349), ('branch', 342), ('base', 311), ('wide', 310), ('erect', 290), ('plant', 285), ('brown', 276), ('apex', 234), ('sometim', 228), ('green', 224), ('basal', 222), ('infloresc', 206), ('pubesc', 199), ('often', 196), ('fruit', 194), ('distal', 184), ('surfac', 182), ('den', 179), ('sheath', 174), ('entir', 170), ('ovat', 163), ('acut', 157), ('white', 156)]

Keys

In [8]:
# Key text under both regimes: bare-bones stop words first,
# then the strict cleaning regime (cleanup=True).
for stop_words, regime_options in (
    (tokenized_stop_words, {}),
    (tokenized_stop_words_clean, {"cleanup": True}),
):
    visualize_words(key, stop_words, color="yellow", **regime_options)
[(',', 8127), (';', 6728), ('.', 4054), ('mm', 2011), (')', 1664), ('(', 1662), ('2', 1205), ('1', 1133), ('usually', 994), ('3', 869), ('leaves', 775), ('cm', 753), ('long', 750), ('blades', 691), ('leaf', 658), ('4', 625), ('5', 603), ('Leaves', 488), ('wide', 470), ('margins', 448), ('glabrous', 446), ('6', 444), ('Leaf', 440), ('...', 427), ('Plants', 375), ('7', 374), ('plants', 359), ('sometimes', 357), ('8', 320), ('10', 318)]
[('leav', 1264), ('leaf', 1098), ('blade', 1015), ('usual', 994), ('plant', 759), ('long', 757), ('stem', 748), ('flower', 627), ('fruit', 616), ('margin', 533), ('wide', 500), ('infloresc', 462), ('glabrou', 446), ('petal', 441), ('base', 405), ('basal', 370), ('branch', 368), ('distal', 361), ('sometim', 357), ('sepal', 356), ('spikelet', 323), ('yellow', 318), ('green', 306), ('pistil', 305), ('scale', 300), ('white', 299), ('spike', 296), ('lobe', 295), ('cell', 288), ('rare', 284)]

Habitat

In [9]:
# Habitat text under both regimes: bare-bones stop words first,
# then the strict cleaning regime (cleanup=True).
for stop_words, regime_options in (
    (tokenized_stop_words, {}),
    (tokenized_stop_words_clean, {"cleanup": True}),
):
    visualize_words(habitat, stop_words, color="purple", **regime_options)
[(',', 2855), (';', 1006), ('.', 580), ('BC', 273), ('zones', 183), ('E', 172), ('N', 164), ('montane', 155), ('lowland', 127), ('CA', 120), ('rare', 119), ('slopes', 117), ('meadows', 113), ('AK', 104), ('(', 87), (')', 87), ('steppe', 86), ('forests', 86), ('infrequent', 81), ('frequent', 81), ('YT', 80), ('Moist', 76), ('dry', 76), ('common', 73), ('America', 72), ('alpine', 70), ('NT', 70), ('zone', 70), ('Mesic', 70), ('Dry', 66)]
[('zone', 253), ('montan', 156), ('lowland', 127), ('rare', 122), ('slope', 117), ('forest', 117), ('moist', 116), ('meadow', 113), ('mesic', 109), ('island', 100), ('open', 88), ('stepp', 87), ('frequent', 84), ('infrequ', 83), ('introduc', 75), ('common', 73), ('america', 72), ('rocki', 71), ('alpin', 70), ('mountain', 61), ('calif', 58), ('mexico', 58), ('eurasia', 55), ('subalpin', 55), ('throughout', 53), ('area', 50), ('europ', 47), ('coastcascad', 47), ('vancouv', 45), ('south', 44)]

Word clouds by flora source

Are there differences between training sets in the most commonly used words?

In [10]:
def _join_dataset_text(source):
    """Return every text record from the dataset named `source`, joined with single spaces.

    The previous version also tested each whole text record for membership in
    tokenized_stop_words. That compares an entire record against individual stop-word
    tokens, so it could not remove stop words from the text. Stop words are handed to
    visualize_words separately when these strings are plotted, so no filtering is
    attempted here, which also matches how the per-classification strings are built.
    """
    source_text = flora_data_frame.loc[flora_data_frame.dataset_name == source, "text"]
    return " ".join(source_text)


# One concatenated string per source flora, used for the per-source word clouds.
bc_text = _join_dataset_text('bc')
budds_text = _join_dataset_text('budds')
fna_text = _join_dataset_text('fna')

BC

In [11]:
# BC text under both regimes: bare-bones stop words first,
# then the strict cleaning regime (cleanup=True).
for stop_words, regime_options in (
    (tokenized_stop_words, {}),
    (tokenized_stop_words_clean, {"cleanup": True}),
):
    visualize_words(bc_text, stop_words, **regime_options)
[(',', 2426), (';', 970), ('.', 752), ('...', 427), ('long', 280), ('BC', 274), ('mm', 221), ('(', 216), (')', 216), ('zones', 181), ('E', 172), ('N', 164), ('cm', 155), ('montane', 150), ('lowland', 127), ('CA', 120), ('rare', 119), ('AK', 104), ('meadows', 93), ('slopes', 86), ('steppe', 85), ('leaves', 83), ('infrequent', 81), ('frequent', 81), ('YT', 80), ('tall', 78), ('common', 76), ('Moist', 72), ('stems', 72), ('NT', 70)]
[('long', 280), ('zone', 251), ('montan', 150), ('leav', 144), ('rare', 136), ('lowland', 127), ('flower', 124), ('mesic', 105), ('stem', 105), ('moist', 103), ('forest', 95), ('meadow', 93), ('slope', 86), ('stepp', 85), ('infrequ', 83), ('frequent', 83), ('island', 80), ('tall', 78), ('open', 77), ('common', 76), ('fruit', 72), ('wide', 70), ('gener', 66), ('lower', 64), ('alpin', 63), ('famili', 63), ('usual', 58), ('mountain', 56), ('throughout', 54), ('le', 53)]

FNA

In [12]:
# FNA text under both regimes: bare-bones stop words first,
# then the strict cleaning regime (cleanup=True).
for stop_words, regime_options in (
    (tokenized_stop_words, {}),
    (tokenized_stop_words_clean, {"cleanup": True}),
):
    visualize_words(fna_text, stop_words, **regime_options)
[(',', 15237), (';', 8591), ('.', 5658), ('mm', 3090), ('(', 2851), (')', 2851), ('usually', 1138), ('cm', 1106), ('1', 1035), ('2', 1030), ('blades', 727), ('leaves', 723), ('3', 715), ('glabrous', 676), ('margins', 675), ('leaf', 665), ('×', 590), ('genus', 570), ('sometimes', 559), ('species', 544), ('4', 501), ('5', 501), (':', 486), ('wide', 486), ('Leaf', 451), ('Leaves', 439), ('±', 428), ('apex', 428), ('blade', 425), ('yellow', 391)]
[('blade', 1182), ('leav', 1163), ('usual', 1138), ('leaf', 1116), ('stem', 882), ('margin', 763), ('flower', 687), ('glabrou', 676), ('fruit', 643), ('plant', 634), ('genu', 572), ('sometim', 561), ('speci', 546), ('distal', 545), ('base', 535), ('wide', 533), ('basal', 530), ('branch', 526), ('petal', 493), ('infloresc', 492), ('sepal', 439), ('surfac', 431), ('apex', 430), ('proxim', 426), ('cell', 422), ('yellow', 396), ('white', 385), ('green', 379), ('lobe', 378), ('rare', 377)]

Budds

In [13]:
# Budds text under both regimes: bare-bones stop words first,
# then the strict cleaning regime (cleanup=True).
for stop_words, regime_options in (
    (tokenized_stop_words, {}),
    (tokenized_stop_words_clean, {"cleanup": True}),
):
    visualize_words(budds_text, stop_words, **regime_options)
[(',', 3039), (';', 2113), ('.', 1977), ('long', 982), ('mm', 765), ('cm', 455), ('2', 316), ('Plants', 270), ('leaves', 267), ('3', 247), ('Leaves', 239), ('1', 235), ('wide', 209), ('high', 198), ('scales', 187), ('C.', 184), (')', 181), ('(', 180), ('4', 173), ('5', 171), ('blades', 170), ('p.', 168), ('brown', 162), ('perigynia', 155), ('spikes', 150), ('culms', 150), ('usually', 148), ('green', 146), ('pubescent', 136), ('base', 132)]
[('long', 985), ('leav', 507), ('plant', 384), ('spike', 287), ('perigynia', 280), ('flower', 264), ('blade', 217), ('lemma', 217), ('spikelet', 214), ('wide', 212), ('culm', 207), ('scale', 206), ('glume', 198), ('high', 198), ('sheath', 181), ('brown', 162), ('stem', 154), ('green', 152), ('usual', 149), ('pistil', 148), ('branch', 146), ('pubesc', 144), ('infloresc', 140), ('panicl', 138), ('base', 137), ('rhizom', 115), ('erect', 112), ('tuft', 111), ('glabrou', 109), ('section', 104)]

Visualize distinctive words using tf-idf

In [14]:
# TF-IDF matrix over every text record, using the bare-bones tokenizer and stop words.
custom_vec = TfidfVectorizer(
    lowercase=True,
    tokenizer=features.flora_tokenizer,
    stop_words=tokenized_stop_words,
    ngram_range=(1, 1),
)
text_counts = custom_vec.fit_transform(flora_data_frame['text'])  # Build TF-IDF Matrix

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed get_feature_names();
# use whichever accessor this environment provides so the cell runs on both.
if hasattr(custom_vec, "get_feature_names_out"):
    feature_names = custom_vec.get_feature_names_out()
else:
    feature_names = custom_vec.get_feature_names()

# Sum each term's TF-IDF weight over all records and rank terms by that total, highest first.
scores = zip(feature_names, np.asarray(text_counts.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Plot the 50 highest-scoring terms.
sorted_scores_df = pd.DataFrame(sorted_scores, columns=['word', 'score']).iloc[:50]
sorted_scores_df.plot.bar(x='word', y='score', title="Top 50 terms by summed TF-IDF (bare-bones tokenizer)")
plt.show()

Distinctive words with new cleaning regime

In [15]:
# TF-IDF matrix over every text record, using the strict-cleaning tokenizer and stop words.
custom_vec = TfidfVectorizer(
    lowercase=True,
    tokenizer=features.flora_tokenizer_clean,
    stop_words=tokenized_stop_words_clean,
    ngram_range=(1, 1),
)
text_counts = custom_vec.fit_transform(flora_data_frame['text'])  # Build TF-IDF Matrix

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed get_feature_names();
# use whichever accessor this environment provides so the cell runs on both.
if hasattr(custom_vec, "get_feature_names_out"):
    feature_names = custom_vec.get_feature_names_out()
else:
    feature_names = custom_vec.get_feature_names()

# Sum each term's TF-IDF weight over all records and rank terms by that total, highest first.
scores = zip(feature_names, np.asarray(text_counts.sum(axis=0)).ravel())
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Plot the 50 highest-scoring terms.
sorted_scores_df = pd.DataFrame(sorted_scores, columns=['word', 'score']).iloc[:50]
sorted_scores_df.plot.bar(x='word', y='score', title="Top 50 terms by summed TF-IDF (strict cleaning)")
plt.show()
/Users/jocelynpender/miniconda3/envs/flora-prepper/lib/python3.7/site-packages/sklearn/feature_extraction/text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['becau'] not in stop_words.
  'stop_words.' % sorted(inconsistent))

Run a DTM based model and a TFIDF based model and review accuracy

In [16]:
# Fit and score two models on the bare-bones features (features.flora_tokenizer plus
# tokenized_stop_words). Each run_model call prints a "MultinomialNB Accuracy" line and its
# return value is unpacked as (held-out labels, predictions); the split and classifier
# details live in src.models.run_model and are not visible from this notebook.
# The dtm_* and tfidf_* names assigned here are reassigned by the later strict-cleaning run.

# ==== DTM =====
# DTM presumably stands for document-term matrix (term counts) - confirm in build_dtm_text_counts.
dtm_text_counts = build_dtm_text_counts(features.flora_tokenizer, tokenized_stop_words, flora_data_frame)
dtm_y_test, dtm_predictions = run_model(dtm_text_counts, flora_data_frame)

# ==== TFIDF =====
# Same tokenizer and stop words, with TF-IDF features in place of the DTM features.
tfidf_text_counts = build_tfidf_text_counts(features.flora_tokenizer, tokenized_stop_words, flora_data_frame)
tfidf_y_test, tfidf_predictions = run_model(tfidf_text_counts, flora_data_frame)
MultinomialNB Accuracy: 0.9732824427480916
MultinomialNB Accuracy: 0.8994910941475827

View classified statements

In [17]:
# Pair each held-out label with its prediction, keyed by the record's original index label.
dtm_y_test_df = pd.DataFrame(dtm_y_test).reset_index()
dtm_predictions_series = pd.Series(dtm_predictions)
results = (
    pd.concat([dtm_y_test_df, dtm_predictions_series], axis=1)
    .rename(columns={0: 'predictions'})
    .set_index('index')
)

# Attach the full records column-wise. Records that were not in the held-out set show NaN in
# the two result columns.
# NOTE: both inputs carry a 'classification' column, so that label appears twice in the result.
results_flora_data_frame = pd.concat([results, flora_data_frame], axis=1)
results_flora_data_frame
Out[17]:
classification predictions dataset_name row_id classification length row species text
0 NaN NaN bc 2538 morphology 132 NaN Hordeum vulgare Leaves Sheaths open, smooth; blades mostly (5)...
1 key key fna 2670 key 223 NaN NaN 1 Radial spines usually 8-15 per areole; fruit...
2 NaN NaN fna 2770 key 564 NaN NaN 1 Leaves at flowering all or mostly cauline mo...
3 habitat habitat bc 6561 habitat 164 NaN Spergularia salina var. salina Wet to moist coastal mudflats or inland alkali...
4 key key fna 1932 key 1405 NaN NaN 1 Aerial stem arising from elongate, fascicled...
... ... ... ... ... ... ... ... ... ...
2613 NaN NaN budds 433 taxon_identification 26 743.0 NaN Cypripedium lady's-slipper
2614 key morphology fna 2308 key 560 NaN NaN 1 Inflorescences solitary flowers; sepals simi...
2615 taxon_identification taxon_identification fna 8174 taxon_identification 258 NaN NaN ericaceae family Jussieu unknown Monotropoidea...
2616 habitat habitat fna 12068 habitat 159 NaN NaN Bogs, muskeg, wet meadows, mossy woods, boreal...
2617 morphology morphology bc 3426 morphology 105 NaN Antennaria rosea ssp. confinis Fruits Achenes 0.6-1.8 mm long, glabrous or wa...

2618 rows × 9 columns

In [18]:
# Held-out records whose prediction disagrees with the true label.
incorrect = results[results.classification != results.predictions]

# incorrect.index holds index *labels* (results was re-indexed by the original row labels),
# so select by label with .loc. Positional .iloc only returned the right rows while every
# label happened to equal its row position.
incorrect_data_frame = results_flora_data_frame.loc[incorrect.index]

# NOTE(review): this cell follows the bare-bones (features.flora_tokenizer) model run, yet the
# file name says "clean" - confirm the name is intended before relying on the file.
incorrect_data_frame.to_csv(path_or_buf="incorrect_dtm_clean.csv")
incorrect_data_frame
Out[18]:
classification predictions dataset_name row_id classification length row species text
425 taxon_identification morphology bc 4245 taxon_identification 40 NaN Erythronium oregonum ssp. oregonum white fawn lily (giant white fawnlily)
397 key morphology budds 21 key 79 136.0 NaN Rootlets solitary, one to a frond Lemna\n\nRoo...
1668 taxon_identification habitat bc 4999 taxon_identification 41 NaN Cuscuta pacifica var. pacifica Cuscuta pacifica Engelm. var. pacifica
613 morphology key bc 2896 morphology 62 NaN Dryas integrifolia ssp. integrifolia Fruits Numerous achenes with long (to 4 cm), f...
2087 key morphology budds 53 key 179 28.0 NaN Megaspores 0.3-0.5 mm in diam, white or\nbluis...
1559 morphology key bc 2973 morphology 110 NaN Calamagrostis montanensis Leaves Leaf blades 2-3 (4) mm wide, seldom rea...
1143 morphology key fna 4335 morphology 40 NaN NaN Leaves with concolorous, cucullate tips.
287 key morphology fna 2963 key 345 NaN NaN 1 Plants 80–200 cm; leaves usually alternate d...
2495 morphology key fna 5728 morphology 170 NaN NaN Leaves: apex generally callous. Inflorescences...
1992 key morphology budds 180 key 105 137.0 NaN Fronds short-stalked or stalkless, floating L....
836 morphology key bc 2243 morphology 61 NaN Calla palustris Fruits Berry-like, egg-shaped, red, 5 mm long;...
2095 taxon_identification morphology bc 5397 taxon_identification 20 NaN Agrostis stolonifera creeping bentgrass
2614 key morphology fna 2308 key 560 NaN NaN 1 Inflorescences solitary flowers; sepals simi...
1733 key morphology budds 4 key 153 47.0 NaN Flowers in a bracted, few-flowered raceme;\nst...
2128 morphology key bc 3069 morphology 417 NaN Geum rivale Leaves Basal leaves several, long-stalked, int...
1578 morphology taxon_identification budds 324 morphology 68 311.0 NaN Valuable forage grasses. Many species grow ver...
1269 key morphology fna 2581 key 430 NaN NaN 1 Stems hairy; leaves mostly cauline, blades 5...
1283 key morphology bc 368 key 173 NaN Pinus ponderosa 4. Cones sessile or subsessile, 5-25 cm long,...
134 key morphology fna 1544 key 203 NaN NaN 1 Calyx glabrous; seeds papillose; petals 6–10...
76 morphology key bc 2555 morphology 70 NaN Ambrosia chamissonis Fruits Achenes enclosed within the involucral ...
1279 key morphology fna 689 key 330 NaN NaN 1 Petals 6–12(–15), ascending to apically recu...

Run with strict cleaning regime

In [19]:
# Repeat the two model runs with the strict cleaning regime (features.flora_tokenizer_clean plus
# tokenized_stop_words_clean). This reassigns the dtm_* and tfidf_* names produced by the
# earlier bare-bones run, so the cells that follow operate on the strict-cleaning results.
# scikit-learn warns here that tokenizing the stop words yields 'becau', which is not in the
# stop-word list, i.e. the list and the tokenizer are not fully consistent.

# ==== DTM =====
# DTM presumably stands for document-term matrix (term counts) - confirm in build_dtm_text_counts.
dtm_text_counts = build_dtm_text_counts(features.flora_tokenizer_clean, tokenized_stop_words_clean, flora_data_frame)
dtm_y_test, dtm_predictions = run_model(dtm_text_counts, flora_data_frame)

# ==== TFIDF =====
# Same tokenizer and stop words, with TF-IDF features in place of the DTM features.
tfidf_text_counts = build_tfidf_text_counts(features.flora_tokenizer_clean, tokenized_stop_words_clean, flora_data_frame)
tfidf_y_test, tfidf_predictions = run_model(tfidf_text_counts, flora_data_frame)
/Users/jocelynpender/miniconda3/envs/flora-prepper/lib/python3.7/site-packages/sklearn/feature_extraction/text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['becau'] not in stop_words.
  'stop_words.' % sorted(inconsistent))
MultinomialNB Accuracy: 0.8867684478371501
/Users/jocelynpender/miniconda3/envs/flora-prepper/lib/python3.7/site-packages/sklearn/feature_extraction/text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['becau'] not in stop_words.
  'stop_words.' % sorted(inconsistent))
MultinomialNB Accuracy: 0.7811704834605598
In [20]:
# Pair each held-out label with its prediction, keyed by the record's original index label.
dtm_y_test_df = pd.DataFrame(dtm_y_test).reset_index()
dtm_predictions_series = pd.Series(dtm_predictions)
results = (
    pd.concat([dtm_y_test_df, dtm_predictions_series], axis=1)
    .rename(columns={0: 'predictions'})
    .set_index('index')
)

# Attach the full records column-wise. Records that were not in the held-out set show NaN in
# the two result columns.
# NOTE: both inputs carry a 'classification' column, so that label appears twice in the result.
results_flora_data_frame = pd.concat([results, flora_data_frame], axis=1)
results_flora_data_frame
Out[20]:
classification predictions dataset_name row_id classification length row species text
0 NaN NaN bc 2538 morphology 132 NaN Hordeum vulgare Leaves Sheaths open, smooth; blades mostly (5)...
1 key key fna 2670 key 223 NaN NaN 1 Radial spines usually 8-15 per areole; fruit...
2 NaN NaN fna 2770 key 564 NaN NaN 1 Leaves at flowering all or mostly cauline mo...
3 habitat habitat bc 6561 habitat 164 NaN Spergularia salina var. salina Wet to moist coastal mudflats or inland alkali...
4 key key fna 1932 key 1405 NaN NaN 1 Aerial stem arising from elongate, fascicled...
... ... ... ... ... ... ... ... ... ...
2613 NaN NaN budds 433 taxon_identification 26 743.0 NaN Cypripedium lady's-slipper
2614 key morphology fna 2308 key 560 NaN NaN 1 Inflorescences solitary flowers; sepals simi...
2615 taxon_identification taxon_identification fna 8174 taxon_identification 258 NaN NaN ericaceae family Jussieu unknown Monotropoidea...
2616 habitat habitat fna 12068 habitat 159 NaN NaN Bogs, muskeg, wet meadows, mossy woods, boreal...
2617 morphology morphology bc 3426 morphology 105 NaN Antennaria rosea ssp. confinis Fruits Achenes 0.6-1.8 mm long, glabrous or wa...

2618 rows × 9 columns

In [21]:
# Held-out records whose prediction disagrees with the true label.
incorrect = results[results.classification != results.predictions]

# incorrect.index holds index *labels* (results was re-indexed by the original row labels),
# so select by label with .loc. Positional .iloc only returned the right rows while every
# label happened to equal its row position.
incorrect_data_frame = results_flora_data_frame.loc[incorrect.index]

# NOTE(review): this cell follows the strict-cleaning (features.flora_tokenizer_clean) model
# run, yet the file name says "dirty" - confirm the name is intended before relying on the file.
incorrect_data_frame.to_csv(path_or_buf="incorrect_dtm_dirty.csv")
incorrect_data_frame
Out[21]:
classification predictions dataset_name row_id classification length row species text
2049 key morphology bc 1169 key 117 NaN NaN 4.\tStems with well-defined infrastipular pric...
192 key morphology bc 1332 key 24 NaN NaN 1. Plants perennials.
425 taxon_identification morphology bc 4245 taxon_identification 40 NaN Erythronium oregonum ssp. oregonum white fawn lily (giant white fawnlily)
397 key morphology budds 21 key 79 136.0 NaN Rootlets solitary, one to a frond Lemna\n\nRoo...
2416 key morphology fna 1598 key 416 NaN NaN 1 Pseudobulbs not elongate, suborbicular, stro...
... ... ... ... ... ... ... ... ... ...
1004 key morphology budds 7 key 293 31.0 NaN 1. Leaves acute, not bristle-tipped, rootlets\...
2487 morphology key fna 4908 morphology 2345 NaN NaN Plants forming mats, cushions, or loosely caes...
1790 taxon_identification morphology bc 5109 taxon_identification 44 NaN Dichanthelium acuminatum western witchgrass (tapered rosette grass)
1453 morphology key fna 4636 morphology 632 NaN NaN Rhizomes shallowly set, 1–2 cm thick. Stems ar...
2050 key morphology budds 9 key 235 35.0 NaN Branchlets not hairy; cones falling the first\...

89 rows × 9 columns

Run a model based on text length

In [22]:
# Process text, remove stopwords. Remove empty cells.
# NOTE(review): the helper's name suggests it may modify flora_data_frame in place - confirm
# before reusing that frame in later cells.
length_processed_flora_data_frame = features.process_length_in_place(flora_data_frame, tokenized_stop_words)

# One histogram of the 'length' column per classification.
text_lengths = length_processed_flora_data_frame['length']
classification_labels = length_processed_flora_data_frame['classification']
plot = text_lengths.hist(by=classification_labels)
plt.show()

It looks like the "discussion" texts should be removed from the dataset: they are curiously short. This may be an artifact of the bc dataset.

In [23]:
# Token counts over the length-processed text, using the bare-bones tokenizer and stop words.
length_vectorizer_options = {
    "lowercase": True,
    "tokenizer": features.flora_tokenizer,
    "stop_words": tokenized_stop_words,
    "ngram_range": (1, 1),
}
length_custom_vec = CountVectorizer(**length_vectorizer_options)
length_text_counts = length_custom_vec.fit_transform(length_processed_flora_data_frame['text'])

# Presumably combines the token counts with the text-length feature; see
# features.prepare_length_features for the exact layout.
length_model_sparse = features.prepare_length_features(
    length_text_counts,
    length_custom_vec,
    length_processed_flora_data_frame,
)

# NOTE(review): earlier cells unpack run_model's first return value as *_y_test, so the name
# X_test looks like a misnomer here - confirm before using it as a feature matrix.
X_test, predicted = run_model(length_model_sparse, length_processed_flora_data_frame)
MultinomialNB Accuracy: 0.9337579617834395
In [24]:
#fig,ax = plt.subplots(figsize=(5,5))
#plt.boxplot(df_train_1.phrase_len)
#plt.show()

TODO: plot classifications coloured by source dataset.

Run a model with only the most frequently occurring words

In [25]:
# Tokenize the concatenation of every text record with the bare-bones tokenizer.
all_text = features.flora_tokenizer(" ".join(flora_data_frame.text))

# Pick the most frequent words (threshold=2000; see features.find_most_frequent_words for its
# exact meaning) and filter each record's text with them and the stop words.
top_words_text = features.find_most_frequent_words(all_text, threshold=2000)
top_words_flora_data_frame = features.filter_data_frame_top_words(
    flora_data_frame,
    top_words_text,
    tokenized_stop_words,
)
top_words_flora_data_frame
Out[25]:
dataset_name row_id classification length row species text
0 bc 2538 morphology 132 NaN Hordeum vulgare leaves sheaths open, smooth; blades mostly (5)...
1 fna 2670 key 223 NaN NaN 1 radial spines usually 8-15 per areole; fruit...
2 fna 2770 key 564 NaN NaN 1 leaves flowering mostly cauline 1 1 leaves f...
3 bc 6561 habitat 164 NaN Spergularia salina var. salina wet moist coastal inland alkaline saline ponds...
4 fna 1932 key 1405 NaN NaN 1 aerial stem arising elongate, roots; tall, c...
... ... ... ... ... ... ... ...
2613 budds 433 taxon_identification 26 743.0 NaN
2614 fna 2308 key 560 NaN NaN 1 inflorescences solitary flowers; sepals simi...
2615 fna 8174 taxon_identification 258 NaN NaN ericaceae family jussieu subfamily arnott 1832...
2616 fna 12068 habitat 159 NaN NaN bogs, muskeg, wet meadows, woods, boreal fores...
2617 bc 3426 morphology 105 NaN Antennaria rosea ssp. confinis fruits achenes mm long, glabrous warty; pappus...

2618 rows × 7 columns

In [26]:
# Token counts over the top-words-filtered text, using the bare-bones tokenizer and stop words.
top_words_vectorizer_options = {
    "lowercase": True,
    "tokenizer": features.flora_tokenizer,
    "stop_words": tokenized_stop_words,
    "ngram_range": (1, 1),
}
all_text_custom_vec = CountVectorizer(**top_words_vectorizer_options)
all_text_counts = all_text_custom_vec.fit_transform(top_words_flora_data_frame['text'])

# Fit and score the model on the filtered text; run_model prints the accuracy.
X_test, predicted = run_model(all_text_counts, top_words_flora_data_frame)
MultinomialNB Accuracy: 0.9325699745547074